# Bootstrap Spark from a local installation so `pyspark` is importable
# outside of a spark-submit context.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Entry point for all DataFrame operations in this notebook.
spark = SparkSession.builder.appName("MLPySpark").getOrCreate()
Importing Store.csv into a PySpark DataFrame.
# Load the Superstore data with a header row and automatic type inference.
df = spark.read.csv("Store.csv", sep=",", inferSchema=True, header=True)
Printing the schema of the PySpark DataFrame.
# Display the inferred column names and types.
df.printSchema()
root |-- Row ID: integer (nullable = true) |-- Order ID: string (nullable = true) |-- Order Date: string (nullable = true) |-- Ship Date: string (nullable = true) |-- Ship Mode: string (nullable = true) |-- Customer ID: string (nullable = true) |-- Customer Name: string (nullable = true) |-- Segment: string (nullable = true) |-- Country: string (nullable = true) |-- City_Code: string (nullable = true) |-- State_Code: string (nullable = true) |-- Postal Code: integer (nullable = true) |-- Product ID: string (nullable = true) |-- Category: string (nullable = true) |-- Sub-Category: string (nullable = true) |-- Sales: double (nullable = true) |-- Quantity: integer (nullable = true) |-- Discount: double (nullable = true) |-- Profit: double (nullable = true)
# Preview the first 20 rows.
df.show()
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+--------+--------+--------+--------+ |Row ID| Order ID|Order Date| Ship Date| Ship Mode|Customer ID| Customer Name| Segment| Country|City_Code|State_Code|Postal Code| Product ID| Category|Sub-Category| Sales|Quantity|Discount| Profit| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+--------+--------+--------+--------+ | 1|CA-2013-152156| 11/9/2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-BO-10001798| Furniture| Bookcases| 261.96| 2| 0.0| 41.9136| | 2|CA-2013-152156| 11/9/2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-CH-10000454| Furniture| Chairs| 731.94| 3| 0.0| 219.582| | 3|CA-2013-138688| 6/13/2013| 6/17/2013| Second Class| DV-13045| Darrin Van Huff| Corporate|United States| C64| S04| 90036|OFF-LA-10000240|Office Supplies| Labels| 14.62| 2| 0.0| 6.8714| | 4|US-2012-108966|10/11/2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|FUR-TA-10000577| Furniture| Tables|957.5775| 5| 0.45|-383.031| | 5|US-2012-108966|10/11/2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|OFF-ST-10000760|Office Supplies| Storage| 22.368| 2| 0.2| 2.5164| | 6|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|FUR-FU-10001487| Furniture| Furnishings| 48.86| 7| 0.0| 14.1694| | 7|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AR-10002833|Office Supplies| Art| 7.28| 4| 0.0| 1.9656| | 8|CA-2011-115812| 6/9/2011| 
6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002275| Technology| Phones| 907.152| 6| 0.2| 90.7152| | 9|CA-2011-115812| 6/9/2011| 6/14/2011| null| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-BI-10003910|Office Supplies| Binders| 18.504| 3| 0.2| 5.7825| | 10|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AP-10002892|Office Supplies| Appliances| 114.9| 5| 0.0| 34.47| | 11|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer| null| C64| S04| 90032|FUR-TA-10001539| Furniture| Tables|1706.184| 9| 0.2| 85.3092| | 12|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002033| Technology| Phones| 911.424| 4| 0.2| 68.3568| | 13|CA-2014-114412| 4/16/2014| 4/21/2014|Standard Class| AA-10480| Andrew Allen| Consumer|United States| C42| S32| 28027|OFF-PA-10002365|Office Supplies| Paper| 15.552| 3| 0.2| 5.4432| | 14|CA-2013-161389| 12/6/2013|12/11/2013|Standard Class| IM-15070| Irene Maddox| Consumer|United States| C587| S46| 98103|OFF-BI-10003656|Office Supplies| Binders| 407.976| 3| 0.2|132.5922| | 15|US-2012-118983|11/22/2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-AP-10002311|Office Supplies| Appliances| 68.81| 5| 0.8|-123.858| | 16|US-2012-118983|11/22/2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-BI-10000756|Office Supplies| Binders| 2.544| 3| 0.8| -3.816| | 17|CA-2011-105893|11/11/2011|11/18/2011|Standard Class| PK-19075| Pete Kriz| Consumer|United States| C597| S48| 53711|OFF-ST-10004186|Office Supplies| Storage| 665.88| 6| 0.0| 13.3176| | 18|CA-2011-167164| 5/13/2011| 5/15/2011| Second Class| AG-10270| Alejandro Grove| Consumer|United States| C558| S43| 84084|OFF-ST-10000107|Office Supplies| 
Storage| 55.5| 2| 0.0| 9.99| | 19|CA-2011-143336| 8/27/2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|OFF-AR-10003056|Office Supplies| Art| 8.56| 2| 0.0| 2.4824| | 20|CA-2011-143336| 8/27/2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|TEC-PH-10001949| Technology| Phones| 213.48| 3| 0.2| 16.011| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+--------+--------+--------+--------+ only showing top 20 rows
import pyspark.sql.functions as F

def count_missings(df, sort=True):
    """Count null (and, for float columns, NaN) values in each column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.
    sort : bool, default True
        If True, return a one-column pandas DataFrame of per-column
        counts sorted descending; otherwise return the raw single-row
        pandas frame.

    Returns
    -------
    pandas.DataFrame or None
        None only when the aggregate produced no rows at all.
    """
    # F.isnan is defined only for floating-point columns; applying it to
    # string/date columns raises an AnalysisException in recent Spark
    # versions, so the NaN test is restricted to float/double columns and
    # all other types are checked for null only.
    ndf = df.select([
        F.count(
            F.when(F.isnan(c) | F.isnull(c), c)
            if c_type in ("double", "float")
            else F.when(F.isnull(c), c)
        ).alias(c)
        for (c, c_type) in df.dtypes
    ]).toPandas()
    if ndf.empty:
        print("There are no missing values!")
        return None
    if sort:
        return ndf.rename(index={0: 'count'}).T.sort_values("count", ascending=False)
    return ndf

# Show missing-value counts before any cleaning.
count_missings(df)
| count | |
|---|---|
| Ship Mode | 22 |
| Segment | 9 |
| Country | 6 |
| Row ID | 0 |
| Postal Code | 0 |
| Discount | 0 |
| Quantity | 0 |
| Sales | 0 |
| Sub-Category | 0 |
| Category | 0 |
| Product ID | 0 |
| City_Code | 0 |
| State_Code | 0 |
| Order ID | 0 |
| Customer Name | 0 |
| Customer ID | 0 |
| Ship Date | 0 |
| Order Date | 0 |
| Profit | 0 |
# Impute the dominant category for the nullable Ship Mode / Country
# columns, then drop any rows that still contain nulls (e.g. Segment).
df = df.fillna({"Ship Mode": "Standard Class", "Country": "United States"})
df = df.na.drop()
# Re-check: every column should now report zero missing values.
count_missings(df)
| count | |
|---|---|
| Row ID | 0 |
| State_Code | 0 |
| Discount | 0 |
| Quantity | 0 |
| Sales | 0 |
| Sub-Category | 0 |
| Category | 0 |
| Product ID | 0 |
| Postal Code | 0 |
| City_Code | 0 |
| Order ID | 0 |
| Country | 0 |
| Segment | 0 |
| Customer Name | 0 |
| Customer ID | 0 |
| Ship Mode | 0 |
| Ship Date | 0 |
| Order Date | 0 |
| Profit | 0 |
from pyspark.sql.types import IntegerType
# Truncate Sales to whole currency units for the outlier analysis below.
df = df.withColumn("Sales", F.col("Sales").cast(IntegerType()))
df.printSchema()
root |-- Row ID: integer (nullable = true) |-- Order ID: string (nullable = true) |-- Order Date: string (nullable = true) |-- Ship Date: string (nullable = true) |-- Ship Mode: string (nullable = true) |-- Customer ID: string (nullable = true) |-- Customer Name: string (nullable = true) |-- Segment: string (nullable = true) |-- Country: string (nullable = true) |-- City_Code: string (nullable = true) |-- State_Code: string (nullable = true) |-- Postal Code: integer (nullable = true) |-- Product ID: string (nullable = true) |-- Category: string (nullable = true) |-- Sub-Category: string (nullable = true) |-- Sales: integer (nullable = true) |-- Quantity: integer (nullable = true) |-- Discount: double (nullable = true) |-- Profit: double (nullable = true)
# Compute Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for each column of
# interest; approxQuantile trades a 5% relative error for speed.
cols = ['Sales']
bounds = {}
for column in cols:
    q1, q3 = df.approxQuantile(column, [0.25, 0.75], 0.05)
    iqr = q3 - q1
    bounds[column] = [q1 - 1.5 * iqr, q3 + 1.5 * iqr]
bounds
{'Sales': [-214.0, 402.0]}
# Flag each row whose value falls outside the Tukey fences; one boolean
# "<col>_o" column per inspected column, keyed by Row ID.
outlier_flags = [
    ((df[c] < bounds[c][0]) | (df[c] > bounds[c][1])).alias(c + '_o')
    for c in cols
]
outliers = df.select('Row ID', *outlier_flags)
outliers.show()
+------+-------+ |Row ID|Sales_o| +------+-------+ | 1| false| | 2| true| | 3| false| | 4| true| | 5| false| | 6| false| | 7| false| | 8| true| | 9| false| | 10| false| | 11| true| | 12| true| | 13| false| | 14| true| | 15| false| | 16| false| | 17| true| | 18| false| | 19| false| | 20| false| +------+-------+ only showing top 20 rows
# Replace out-of-fence Sales values with the rounded global mean.
# Reuse the fences computed in `bounds` above instead of re-hard-coding
# the values 402 / -214 (which would silently drift if the data changed).
lower, upper = bounds['Sales']
from pyspark.sql import functions as F, Window
# Window.orderBy on a constant makes every row a "peer" of every other
# row under the default RANGE frame, so mean('Sales') over this window
# is the global mean broadcast to each row.
df = df.withColumn(
    'Sales',
    F.when(
        (df['Sales'] > upper) | (df['Sales'] < lower),
        F.round(F.mean('Sales').over(Window.orderBy(F.lit(1)))).cast('int')
    ).otherwise(F.col('Sales'))
)
df.show()
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ |Row ID| Order ID|Order Date| Ship Date| Ship Mode|Customer ID| Customer Name| Segment| Country|City_Code|State_Code|Postal Code| Product ID| Category|Sub-Category|Sales|Quantity|Discount| Profit| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ | 1|CA-2013-152156| 11/9/2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-BO-10001798| Furniture| Bookcases| 261| 2| 0.0| 41.9136| | 2|CA-2013-152156| 11/9/2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-CH-10000454| Furniture| Chairs| 229| 3| 0.0| 219.582| | 3|CA-2013-138688| 6/13/2013| 6/17/2013| Second Class| DV-13045| Darrin Van Huff| Corporate|United States| C64| S04| 90036|OFF-LA-10000240|Office Supplies| Labels| 14| 2| 0.0| 6.8714| | 4|US-2012-108966|10/11/2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|FUR-TA-10000577| Furniture| Tables| 229| 5| 0.45|-383.031| | 5|US-2012-108966|10/11/2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|OFF-ST-10000760|Office Supplies| Storage| 22| 2| 0.2| 2.5164| | 6|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|FUR-FU-10001487| Furniture| Furnishings| 48| 7| 0.0| 14.1694| | 7|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AR-10002833|Office Supplies| Art| 7| 4| 0.0| 1.9656| | 8|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| 
Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002275| Technology| Phones| 229| 6| 0.2| 90.7152| | 9|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-BI-10003910|Office Supplies| Binders| 18| 3| 0.2| 5.7825| | 10|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AP-10002892|Office Supplies| Appliances| 114| 5| 0.0| 34.47| | 11|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|FUR-TA-10001539| Furniture| Tables| 229| 9| 0.2| 85.3092| | 12|CA-2011-115812| 6/9/2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002033| Technology| Phones| 229| 4| 0.2| 68.3568| | 13|CA-2014-114412| 4/16/2014| 4/21/2014|Standard Class| AA-10480| Andrew Allen| Consumer|United States| C42| S32| 28027|OFF-PA-10002365|Office Supplies| Paper| 15| 3| 0.2| 5.4432| | 14|CA-2013-161389| 12/6/2013|12/11/2013|Standard Class| IM-15070| Irene Maddox| Consumer|United States| C587| S46| 98103|OFF-BI-10003656|Office Supplies| Binders| 229| 3| 0.2|132.5922| | 15|US-2012-118983|11/22/2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-AP-10002311|Office Supplies| Appliances| 68| 5| 0.8|-123.858| | 16|US-2012-118983|11/22/2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-BI-10000756|Office Supplies| Binders| 2| 3| 0.8| -3.816| | 17|CA-2011-105893|11/11/2011|11/18/2011|Standard Class| PK-19075| Pete Kriz| Consumer|United States| C597| S48| 53711|OFF-ST-10004186|Office Supplies| Storage| 229| 6| 0.0| 13.3176| | 18|CA-2011-167164| 5/13/2011| 5/15/2011| Second Class| AG-10270| Alejandro Grove| Consumer|United States| C558| S43| 84084|OFF-ST-10000107|Office Supplies| Storage| 55| 2| 0.0| 9.99| | 19|CA-2011-143336| 
8/27/2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|OFF-AR-10003056|Office Supplies| Art| 8| 2| 0.0| 2.4824| | 20|CA-2011-143336| 8/27/2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|TEC-PH-10001949| Technology| Phones| 213| 3| 0.2| 16.011| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ only showing top 20 rows
The outliers are handled by replacing each outlying value with the rounded mean of the Sales column.
# NOTE(review): the original used `from pyspark.sql.functions import *`,
# which shadows Python builtins such as round/sum/min/max. Use the `F`
# alias (already imported earlier in the notebook) instead.
from pyspark.sql import functions as F
# Normalise the Order Date separator from '/' to '-'.
df = df.withColumn('Order Date', F.regexp_replace('Order Date', '/', '-'))
df.show()
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ |Row ID| Order ID|Order Date| Ship Date| Ship Mode|Customer ID| Customer Name| Segment| Country|City_Code|State_Code|Postal Code| Product ID| Category|Sub-Category|Sales|Quantity|Discount| Profit| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ | 1|CA-2013-152156| 11-9-2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-BO-10001798| Furniture| Bookcases| 261| 2| 0.0| 41.9136| | 2|CA-2013-152156| 11-9-2013|11/12/2013| Second Class| CG-12520| Claire Gute| Consumer|United States| C259| S16| 42420|FUR-CH-10000454| Furniture| Chairs| 229| 3| 0.0| 219.582| | 3|CA-2013-138688| 6-13-2013| 6/17/2013| Second Class| DV-13045| Darrin Van Huff| Corporate|United States| C64| S04| 90036|OFF-LA-10000240|Office Supplies| Labels| 14| 2| 0.0| 6.8714| | 4|US-2012-108966|10-11-2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|FUR-TA-10000577| Furniture| Tables| 229| 5| 0.45|-383.031| | 5|US-2012-108966|10-11-2012|10/18/2012|Standard Class| SO-20335| Sean O'Donnell| Consumer|United States| C150| S09| 33311|OFF-ST-10000760|Office Supplies| Storage| 22| 2| 0.2| 2.5164| | 6|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|FUR-FU-10001487| Furniture| Furnishings| 48| 7| 0.0| 14.1694| | 7|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AR-10002833|Office Supplies| Art| 7| 4| 0.0| 1.9656| | 8|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| 
Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002275| Technology| Phones| 229| 6| 0.2| 90.7152| | 9|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-BI-10003910|Office Supplies| Binders| 18| 3| 0.2| 5.7825| | 10|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|OFF-AP-10002892|Office Supplies| Appliances| 114| 5| 0.0| 34.47| | 11|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|FUR-TA-10001539| Furniture| Tables| 229| 9| 0.2| 85.3092| | 12|CA-2011-115812| 6-9-2011| 6/14/2011|Standard Class| BH-11710| Brosina Hoffman| Consumer|United States| C64| S04| 90032|TEC-PH-10002033| Technology| Phones| 229| 4| 0.2| 68.3568| | 13|CA-2014-114412| 4-16-2014| 4/21/2014|Standard Class| AA-10480| Andrew Allen| Consumer|United States| C42| S32| 28027|OFF-PA-10002365|Office Supplies| Paper| 15| 3| 0.2| 5.4432| | 14|CA-2013-161389| 12-6-2013|12/11/2013|Standard Class| IM-15070| Irene Maddox| Consumer|United States| C587| S46| 98103|OFF-BI-10003656|Office Supplies| Binders| 229| 3| 0.2|132.5922| | 15|US-2012-118983|11-22-2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-AP-10002311|Office Supplies| Appliances| 68| 5| 0.8|-123.858| | 16|US-2012-118983|11-22-2012|11/26/2012|Standard Class| HP-14815| Harold Pawlan|Home Office|United States| C515| S42| 76106|OFF-BI-10000756|Office Supplies| Binders| 2| 3| 0.8| -3.816| | 17|CA-2011-105893|11-11-2011|11/18/2011|Standard Class| PK-19075| Pete Kriz| Consumer|United States| C597| S48| 53711|OFF-ST-10004186|Office Supplies| Storage| 229| 6| 0.0| 13.3176| | 18|CA-2011-167164| 5-13-2011| 5/15/2011| Second Class| AG-10270| Alejandro Grove| Consumer|United States| C558| S43| 84084|OFF-ST-10000107|Office Supplies| Storage| 55| 2| 0.0| 9.99| | 19|CA-2011-143336| 
8-27-2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|OFF-AR-10003056|Office Supplies| Art| 8| 2| 0.0| 2.4824| | 20|CA-2011-143336| 8-27-2011| 9/1/2011| Second Class| ZD-21925|Zuschuss Donatelli| Consumer|United States| C91| S04| 94109|TEC-PH-10001949| Technology| Phones| 213| 3| 0.2| 16.011| +------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------+----------+-----------+---------------+---------------+------------+-----+--------+--------+--------+ only showing top 20 rows
# Demonstrate renaming: withColumnRenamed returns a NEW DataFrame
# (df itself is unchanged by this line).
df.withColumnRenamed("Order Date", "Order_Date").printSchema()
root |-- Row ID: integer (nullable = true) |-- Order ID: string (nullable = true) |-- Order_Date: string (nullable = true) |-- Ship Date: string (nullable = true) |-- Ship Mode: string (nullable = true) |-- Customer ID: string (nullable = true) |-- Customer Name: string (nullable = true) |-- Segment: string (nullable = true) |-- Country: string (nullable = true) |-- City_Code: string (nullable = true) |-- State_Code: string (nullable = true) |-- Postal Code: integer (nullable = true) |-- Product ID: string (nullable = true) |-- Category: string (nullable = true) |-- Sub-Category: string (nullable = true) |-- Sales: integer (nullable = true) |-- Quantity: integer (nullable = true) |-- Discount: double (nullable = true) |-- Profit: double (nullable = true)
from pyspark.sql.functions import to_date
# The dates were just reformatted to "11-9-2013" (month-day-year).
# Without an explicit pattern, to_date assumes ISO yyyy-MM-dd and
# would yield nulls, so pass the 'M-d-yyyy' pattern.
# Note: the result is only displayed -- df is deliberately not
# reassigned, matching the original notebook flow.
df.withColumn('Order Date', to_date('Order Date', 'M-d-yyyy'))
DataFrame[Row ID: int, Order ID: string, Order Date: date, Ship Date: string, Ship Mode: string, Customer ID: string, Customer Name: string, Segment: string, Country: string, City_Code: string, State_Code: string, Postal Code: int, Product ID: string, Category: string, Sub-Category: string, Sales: int, Quantity: int, Discount: double, Profit: double]
# Monthly-aggregated time-series splits prepared offline:
# tSTrain.csv covers 2011-01..2014-06, tSTest.csv the 2014-07..12 hold-out.
train = spark.read.csv("tSTrain.csv", sep=",", inferSchema=True, header=True)
test = spark.read.csv("tSTest.csv", sep=",", inferSchema=True, header=True)
train.show()
+-------------------+------------------+ | Order_Date| Sales| +-------------------+------------------+ |2011-01-31 00:00:00|10274.951000000001| |2011-02-28 00:00:00| 4127.044| |2011-03-31 00:00:00|23906.656000000003| |2011-04-30 00:00:00|19937.923000000003| |2011-05-31 00:00:00|17323.601000000002| |2011-06-30 00:00:00|22346.234600000003| |2011-07-31 00:00:00| 21725.115| |2011-08-31 00:00:00| 21955.2065| |2011-09-30 00:00:00|40155.516800000005| |2011-10-31 00:00:00| 22765.181| |2011-11-30 00:00:00| 52972.64570000001| |2011-12-31 00:00:00|44318.414500000006| |2012-01-31 00:00:00| 9030.27| |2012-02-29 00:00:00| 9266.875| |2012-03-31 00:00:00|21324.827600000004| |2012-04-30 00:00:00| 25107.8365| |2012-05-31 00:00:00|20411.235500000003| |2012-06-30 00:00:00| 18222.791| |2012-07-31 00:00:00|22652.713000000003| |2012-08-31 00:00:00|26155.326200000003| +-------------------+------------------+ only showing top 20 rows
# describe() only builds a lazy summary DataFrame; call show() so the
# statistics are actually printed (the original displayed only the repr).
train.describe().show()
DataFrame[summary: string, Sales: string]
# Preview the 6-month hold-out period.
test.show()
+-------------------+------------------+ | Order_Date| Sales| +-------------------+------------------+ |2014-07-31 00:00:00| 35732.352| |2014-08-31 00:00:00| 34269.544| |2014-09-30 00:00:00|63690.990000000005| |2014-10-31 00:00:00|41701.650200000004| |2014-11-30 00:00:00| 67037.914| |2014-12-31 00:00:00| 67413.75480000001| +-------------------+------------------+
# describe() only builds a lazy summary DataFrame; call show() so the
# statistics are actually printed (the original displayed only the repr).
test.describe().show()
DataFrame[summary: string, Sales: string]
# Convert the Spark frame to pandas for plotting and Prophet.
train = train.toPandas()
# NOTE(review): the original called train.set_index('Order_Date') without
# assigning the result -- set_index is not in-place by default, so the
# call was a no-op (the displayed frame kept its 0..41 integer index).
# The downstream plot and Prophet steps need Order_Date as a regular
# column, so the dead call is removed rather than assigned.
train
| Order_Date | Sales | |
|---|---|---|
| 0 | 2011-01-31 | 10274.9510 |
| 1 | 2011-02-28 | 4127.0440 |
| 2 | 2011-03-31 | 23906.6560 |
| 3 | 2011-04-30 | 19937.9230 |
| 4 | 2011-05-31 | 17323.6010 |
| 5 | 2011-06-30 | 22346.2346 |
| 6 | 2011-07-31 | 21725.1150 |
| 7 | 2011-08-31 | 21955.2065 |
| 8 | 2011-09-30 | 40155.5168 |
| 9 | 2011-10-31 | 22765.1810 |
| 10 | 2011-11-30 | 52972.6457 |
| 11 | 2011-12-31 | 44318.4145 |
| 12 | 2012-01-31 | 9030.2700 |
| 13 | 2012-02-29 | 9266.8750 |
| 14 | 2012-03-31 | 21324.8276 |
| 15 | 2012-04-30 | 25107.8365 |
| 16 | 2012-05-31 | 20411.2355 |
| 17 | 2012-06-30 | 18222.7910 |
| 18 | 2012-07-31 | 22652.7130 |
| 19 | 2012-08-31 | 26155.3262 |
| 20 | 2012-09-30 | 44318.3680 |
| 21 | 2012-10-31 | 24492.0380 |
| 22 | 2012-11-30 | 52883.7593 |
| 23 | 2012-12-31 | 47967.2752 |
| 24 | 2013-01-31 | 13090.6050 |
| 25 | 2013-02-28 | 13012.8350 |
| 26 | 2013-03-31 | 28211.7270 |
| 27 | 2013-04-30 | 23513.1070 |
| 28 | 2013-05-31 | 30296.1310 |
| 29 | 2013-06-30 | 28411.5570 |
| 30 | 2013-07-31 | 28132.1440 |
| 31 | 2013-08-31 | 25231.3223 |
| 32 | 2013-09-30 | 50811.9075 |
| 33 | 2013-10-31 | 24950.6380 |
| 34 | 2013-11-30 | 54874.1050 |
| 35 | 2013-12-31 | 61319.5990 |
| 36 | 2014-01-31 | 22807.8670 |
| 37 | 2014-02-28 | 16179.8594 |
| 38 | 2014-03-31 | 32729.5220 |
| 39 | 2014-04-30 | 25353.7834 |
| 40 | 2014-05-31 | 35201.3374 |
| 41 | 2014-06-30 | 34349.3592 |
# Convert the hold-out frame to pandas for evaluation.
test = test.toPandas()
# NOTE(review): as with train, set_index('Order_Date') was called without
# assignment and therefore had no effect; it is removed. The rmse()
# comparison below relies on the default integer index.
test
| Order_Date | Sales | |
|---|---|---|
| 0 | 2014-07-31 | 35732.3520 |
| 1 | 2014-08-31 | 34269.5440 |
| 2 | 2014-09-30 | 63690.9900 |
| 3 | 2014-10-31 | 41701.6502 |
| 4 | 2014-11-30 | 67037.9140 |
| 5 | 2014-12-31 | 67413.7548 |
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import pandas as pd
# Visual check of trend/seasonality before modelling.
# NOTE(review): adfuller and pd are imported but not used in this section.
train.plot(x='Order_Date', y='Sales', figsize=(18,6))
<matplotlib.axes._subplots.AxesSubplot at 0x7f1f0b54e9e8>
# Prophet requires the input columns to be named 'ds' (date) and 'y' (value).
train.columns = ['ds', 'y']
from fbprophet import Prophet
m = Prophet()
m.fit(train)
# Extend 60 periods past the training range for forecasting.
# NOTE(review): freq='MS' generates month-START dates while the training
# observations fall on month-END dates -- confirm the intended alignment;
# the iloc[43:49] evaluation slice further below depends on exactly this
# future-frame construction.
future = m.make_future_dataframe(periods = 60, freq='MS')
forecast = m.predict(future)
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
# Inspect the first forecast rows (trend, seasonality components, yhat).
forecast.head()
| ds | trend | trend_lower | trend_upper | yhat_lower | yhat_upper | additive_terms | additive_terms_lower | additive_terms_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yearly | yearly_lower | yearly_upper | yhat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-31 | 23356.314651 | 23356.314651 | 23356.314651 | 5873.906383 | 11707.171410 | -14586.424311 | -14586.424311 | -14586.424311 | 0.0 | 0.0 | 0.0 | -14586.424311 | -14586.424311 | -14586.424311 | 8769.890340 |
| 1 | 2011-02-28 | 23560.158928 | 23560.158928 | 23560.158928 | 1817.030631 | 7534.961259 | -19057.407919 | -19057.407919 | -19057.407919 | 0.0 | 0.0 | 0.0 | -19057.407919 | -19057.407919 | -19057.407919 | 4502.751009 |
| 2 | 2011-03-31 | 23785.843666 | 23785.843666 | 23785.843666 | 20913.445554 | 26759.945274 | -35.708268 | -35.708268 | -35.708268 | 0.0 | 0.0 | 0.0 | -35.708268 | -35.708268 | -35.708268 | 23750.135398 |
| 3 | 2011-04-30 | 24004.248252 | 24004.248252 | 24004.248252 | 13656.216065 | 19431.051086 | -7551.796980 | -7551.796980 | -7551.796980 | 0.0 | 0.0 | 0.0 | -7551.796980 | -7551.796980 | -7551.796980 | 16452.451272 |
| 4 | 2011-05-31 | 24229.932988 | 24229.932988 | 24229.932988 | 16977.563801 | 22622.379691 | -4420.873255 | -4420.873255 | -4420.873255 | 0.0 | 0.0 | 0.0 | -4420.873255 | -4420.873255 | -4420.873255 | 19809.059733 |
from fbprophet.plot import plot_plotly, plot_components_plotly
import plotly.offline as py
import plotly.graph_objects as go
# Render the interactive forecast and component plots inline.
py.init_notebook_mode()
fig = plot_plotly(m, forecast)
py.iplot(fig)
plot_components_plotly(m, forecast)
from statsmodels.tools.eval_measures import rmse
# Forecast rows 43-48 correspond to the six hold-out months in `test`;
# rmse compares the two series positionally.
predictions = forecast['yhat'].iloc[43:49]
print("Root Mean Squared Error between actual and predicted values: ", rmse(predictions, test['Sales']))
Root Mean Squared Error between actual and predicted values: 6825.492364406342
As we can see, the model performs well. We implemented the Prophet model, which we had not tried before, in Python itself, since we could not identify any module for time-series algorithms in the PySpark library.